数据导入

In [96]:
import pandas as pd
In [97]:
df = pd.read_csv("heart.csv")

数据分析

In [98]:
# 查看前五行
df.head()
Out[98]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
0 63 1 3 145 233 1 0 150 0 2.3 0 0 1 1
1 37 1 2 130 250 0 1 187 0 3.5 0 0 2 1
2 41 0 1 130 204 0 0 172 0 1.4 2 0 2 1
3 56 1 1 120 236 0 1 178 0 0.8 2 0 2 1
4 57 0 0 120 354 0 1 163 1 0.6 2 0 2 1
In [99]:
df.shape
Out[99]:
(303, 14)
In [100]:
# 查看后五行
df.tail()
Out[100]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
298 57 0 0 140 241 0 1 123 1 0.2 1 0 3 0
299 45 1 3 110 264 0 1 132 0 1.2 1 0 3 0
300 68 1 0 144 193 1 1 141 0 3.4 1 2 3 0
301 57 1 0 130 131 0 1 115 1 1.2 1 1 3 0
302 57 0 1 130 236 0 0 174 0 0.0 1 1 2 0
In [101]:
# 查看列名
df.columns
Out[101]:
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')
In [102]:
# 查看统计分布
df.describe()
Out[102]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
count 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000
mean 54.366337 0.683168 0.966997 131.623762 246.264026 0.148515 0.528053 149.646865 0.326733 1.039604 1.399340 0.729373 2.313531 0.544554
std 9.082101 0.466011 1.032052 17.538143 51.830751 0.356198 0.525860 22.905161 0.469794 1.161075 0.616226 1.022606 0.612277 0.498835
min 29.000000 0.000000 0.000000 94.000000 126.000000 0.000000 0.000000 71.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 47.500000 0.000000 0.000000 120.000000 211.000000 0.000000 0.000000 133.500000 0.000000 0.000000 1.000000 0.000000 2.000000 0.000000
50% 55.000000 1.000000 1.000000 130.000000 240.000000 0.000000 1.000000 153.000000 0.000000 0.800000 1.000000 0.000000 2.000000 1.000000
75% 61.000000 1.000000 2.000000 140.000000 274.500000 0.000000 1.000000 166.000000 1.000000 1.600000 2.000000 1.000000 3.000000 1.000000
max 77.000000 1.000000 3.000000 200.000000 564.000000 1.000000 2.000000 202.000000 1.000000 6.200000 2.000000 4.000000 3.000000 1.000000
In [103]:
# 查看数据集信息
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
In [104]:
# 缺失值统计
df.isnull().sum()
Out[104]:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

一行代码产生数据探索性EDA报告

In [105]:
import pandas_profiling
In [106]:
profile = pandas_profiling.ProfileReport(df)
In [107]:
# 生成数据集报告
profile



Out[107]:

In [108]:
# 将报告保存到本地
profile.to_file('profile.html')

数据库可视化分析

In [109]:
import matplotlib.pyplot as plt 
import seaborn as sns
In [110]:
# 特征两两相关性分析
df.corr()
Out[110]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
age 1.000000 -0.098447 -0.068653 0.279351 0.213678 0.121308 -0.116211 -0.398522 0.096801 0.210013 -0.168814 0.276326 0.068001 -0.225439
sex -0.098447 1.000000 -0.049353 -0.056769 -0.197912 0.045032 -0.058196 -0.044020 0.141664 0.096093 -0.030711 0.118261 0.210041 -0.280937
cp -0.068653 -0.049353 1.000000 0.047608 -0.076904 0.094444 0.044421 0.295762 -0.394280 -0.149230 0.119717 -0.181053 -0.161736 0.433798
trestbps 0.279351 -0.056769 0.047608 1.000000 0.123174 0.177531 -0.114103 -0.046698 0.067616 0.193216 -0.121475 0.101389 0.062210 -0.144931
chol 0.213678 -0.197912 -0.076904 0.123174 1.000000 0.013294 -0.151040 -0.009940 0.067023 0.053952 -0.004038 0.070511 0.098803 -0.085239
fbs 0.121308 0.045032 0.094444 0.177531 0.013294 1.000000 -0.084189 -0.008567 0.025665 0.005747 -0.059894 0.137979 -0.032019 -0.028046
restecg -0.116211 -0.058196 0.044421 -0.114103 -0.151040 -0.084189 1.000000 0.044123 -0.070733 -0.058770 0.093045 -0.072042 -0.011981 0.137230
thalach -0.398522 -0.044020 0.295762 -0.046698 -0.009940 -0.008567 0.044123 1.000000 -0.378812 -0.344187 0.386784 -0.213177 -0.096439 0.421741
exang 0.096801 0.141664 -0.394280 0.067616 0.067023 0.025665 -0.070733 -0.378812 1.000000 0.288223 -0.257748 0.115739 0.206754 -0.436757
oldpeak 0.210013 0.096093 -0.149230 0.193216 0.053952 0.005747 -0.058770 -0.344187 0.288223 1.000000 -0.577537 0.222682 0.210244 -0.430696
slope -0.168814 -0.030711 0.119717 -0.121475 -0.004038 -0.059894 0.093045 0.386784 -0.257748 -0.577537 1.000000 -0.080155 -0.104764 0.345877
ca 0.276326 0.118261 -0.181053 0.101389 0.070511 0.137979 -0.072042 -0.213177 0.115739 0.222682 -0.080155 1.000000 0.151832 -0.391724
thal 0.068001 0.210041 -0.161736 0.062210 0.098803 -0.032019 -0.011981 -0.096439 0.206754 0.210244 -0.104764 0.151832 1.000000 -0.344029
target -0.225439 -0.280937 0.433798 -0.144931 -0.085239 -0.028046 0.137230 0.421741 -0.436757 -0.430696 0.345877 -0.391724 -0.344029 1.000000
In [111]:
# 可视化热力图
plt.figure(figsize=(10,10), dpi=400)
sns.heatmap(df.corr(), annot=True, fmt='.1f', square=True)   # annot是否显示数字
plt.show()
In [112]:
# 查看api
sns.heatmap? 
  File "<ipython-input-112-02d25533acc3>", line 2
    sns.heatmap?
               ^
SyntaxError: invalid syntax
In [ ]:
# 绘制两两散点图
sns.pairplot(df)
plt.show()
In [ ]:
# 单个特征统计分布分析
sns.distplot(df['age'])
plt.show()
In [ ]:
df["age"].max()
In [ ]:
df.age.max()
In [ ]:
# 查看不同的数字
df.age.unique()
In [ ]:
# 每个元素出现的次数
df.target.value_counts()
In [26]:
# 对一列的数据集统计分析
sns.countplot(x="target", data=df, palette="bwr")
plt.show()
In [27]:
sns.countplot(x="sex", data=df, palette="mako_r")
plt.xlabel("Sex(0 =female, 1=male)")
plt.show()
In [28]:
# 单列特征与标签的关系
pd.crosstab(df.age, df.target).plot(kind="bar", figsize=(20, 6))
plt.title("Hear Disease Frequency for Ages")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.savefig("heartDiseaseAndAges.png")
plt.show()
In [29]:
# 箱形图
sns.boxplot(x=df.target,y=df.age)
plt.show()
In [30]:
# 小提琴图
sns.violinplot(x=df.target, y=df.age)
plt.show()
In [31]:
# 散点图不同年龄段和不同最大心率的患病情况
plt.scatter(x=df.age[df.target==1], y=df.thalach[df.target==1], c="red")
plt.scatter(x=df.age[df.target==0], y=df.thalach[df.target==0], c="blue")
plt.legend(["Disease", "Not Disease"])
plt.xlabel("Age")
plt.ylabel("Maximum Heart Rate")
plt.show()
In [32]:
# 忽略烦人的红色提示
import warnings
warnings.filterwarnings("ignore")
In [33]:
# 简写列名修改为完整列名
df.columns = ['age', 'sex', 'chest_pain_type','resting_blood_pressure','cholesterol','fasting_blood_sugar','rest_ecg',
              'max_hear_rate_achieced','exercise_induced_angina', 'st_depression',
              'st_slope','num_major_vessels','thalassemin','target']
In [34]:
df.head()
Out[34]:
age sex chest_pain_type resting_blood_pressure cholesterol fasting_blood_sugar rest_ecg max_hear_rate_achieced exercise_induced_angina st_depression st_slope num_major_vessels thalassemin target
0 63 1 3 145 233 1 0 150 0 2.3 0 0 1 1
1 37 1 2 130 250 0 1 187 0 3.5 0 0 2 1
2 41 0 1 130 204 0 0 172 0 1.4 2 0 2 1
3 56 1 1 120 236 0 1 178 0 0.8 2 0 2 1
4 57 0 0 120 354 0 1 163 1 0.6 2 0 2 1

image.png

In [35]:
# 定类特征的整数编码转换成字符串
df["sex"][df.sex == 0] = 'female'
df["sex"][df.sex == 1] = 'male'

df['chest_pain_type'][df.chest_pain_type == 0] = "typical angina"
df['chest_pain_type'][df.chest_pain_type == 1] = "atypical angina"
df['chest_pain_type'][df.chest_pain_type == 2] = "non-anginal pain"
df['chest_pain_type'][df.chest_pain_type == 3] = "asymptomatic"

df["fasting_blood_sugar"][df.fasting_blood_sugar == 0] = "lower than 120mg/ml"
df["fasting_blood_sugar"][df.fasting_blood_sugar == 1] = "greater than 120mg/ml"

df['rest_ecg'][df.rest_ecg == 0] = "normal"
df['rest_ecg'][df.rest_ecg == 1] = "ST-T wave abnormality"
df['rest_ecg'][df.rest_ecg == 2] = "left ventricular hypertrophy"

df['exercise_induced_angina'][df.exercise_induced_angina == 0] = "yes"
df['exercise_induced_angina'][df.exercise_induced_angina == 1] = "no"

df['st_slope'][df.st_slope == 0] ='upsloping'
df['st_slope'][df.st_slope == 1] ='flat'
df['st_slope'][df.st_slope == 2] ='downsloping'

df['thalassemin'][df.thalassemin == 0] = 'unknown'
df['thalassemin'][df.thalassemin == 1] = 'normal'
df['thalassemin'][df.thalassemin == 2] = 'fixed defect'
df['thalassemin'][df.thalassemin == 3] = 'reversable defect'
In [36]:
df.head()
Out[36]:
age sex chest_pain_type resting_blood_pressure cholesterol fasting_blood_sugar rest_ecg max_hear_rate_achieced exercise_induced_angina st_depression st_slope num_major_vessels thalassemin target
0 63 male asymptomatic 145 233 greater than 120mg/ml normal 150 yes 2.3 upsloping 0 normal 1
1 37 male non-anginal pain 130 250 lower than 120mg/ml ST-T wave abnormality 187 yes 3.5 upsloping 0 fixed defect 1
2 41 female atypical angina 130 204 lower than 120mg/ml normal 172 yes 1.4 downsloping 0 fixed defect 1
3 56 male atypical angina 120 236 lower than 120mg/ml ST-T wave abnormality 178 yes 0.8 downsloping 0 fixed defect 1
4 57 female typical angina 120 354 lower than 120mg/ml ST-T wave abnormality 163 no 0.6 downsloping 0 fixed defect 1
In [37]:
# one-hot编码(对obj的进行分列)
df = pd.get_dummies(df)
In [38]:
df.columns
Out[38]:
Index(['age', 'resting_blood_pressure', 'cholesterol',
       'max_hear_rate_achieced', 'st_depression', 'num_major_vessels',
       'target', 'sex_female', 'sex_male', 'chest_pain_type_asymptomatic',
       'chest_pain_type_atypical angina', 'chest_pain_type_non-anginal pain',
       'chest_pain_type_typical angina',
       'fasting_blood_sugar_greater than 120mg/ml',
       'fasting_blood_sugar_lower than 120mg/ml',
       'rest_ecg_ST-T wave abnormality',
       'rest_ecg_left ventricular hypertrophy', 'rest_ecg_normal',
       'exercise_induced_angina_no', 'exercise_induced_angina_yes',
       'st_slope_downsloping', 'st_slope_flat', 'st_slope_upsloping',
       'thalassemin_fixed defect', 'thalassemin_normal',
       'thalassemin_reversable defect', 'thalassemin_unknown'],
      dtype='object')
In [39]:
df.head()
Out[39]:
age resting_blood_pressure cholesterol max_hear_rate_achieced st_depression num_major_vessels target sex_female sex_male chest_pain_type_asymptomatic ... rest_ecg_normal exercise_induced_angina_no exercise_induced_angina_yes st_slope_downsloping st_slope_flat st_slope_upsloping thalassemin_fixed defect thalassemin_normal thalassemin_reversable defect thalassemin_unknown
0 63 145 233 150 2.3 0 1 0 1 1 ... 1 0 1 0 0 1 0 1 0 0
1 37 130 250 187 3.5 0 1 0 1 0 ... 0 0 1 0 0 1 1 0 0 0
2 41 130 204 172 1.4 0 1 1 0 0 ... 1 0 1 1 0 0 1 0 0 0
3 56 120 236 178 0.8 0 1 0 1 0 ... 0 0 1 1 0 0 1 0 0 0
4 57 120 354 163 0.6 0 1 1 0 0 ... 0 1 0 1 0 0 1 0 0 0

5 rows × 27 columns

In [40]:
# 导出处理好的数据集
df.to_csv("process_heart.csv", index=False)

pdpbox工具包可视化

In [41]:
from pdpbox import pdp, get_dataset, info_plots
In [43]:
fig, axed, summary_df = info_plots.target_plot(
    df=df,feature='sex_male', feature_name='gender', target=['target']
)
#_ = axes['bar_ax'].set_xticklabels({"Female", 'Male'})
In [44]:
fig, axed, summary_df = info_plots.target_plot(
    df=df,feature='age', feature_name='age', target=['target']
)
In [45]:
# 两两之间的关系
feat_name1 = 'age'
nick_name1 = 'age'
feat_name2 = 'max_hear_rate_achieced'
nick_name2 = 'max_hart_rate'
fig, axed, summary_df = info_plots.target_plot_interact(
    df=df,features=[feat_name1, feat_name2], feature_names=[nick_name1,nick_name2], target=['target']
)
plt.show()

随机森林做分类

In [46]:
# 除了target列,全部取出来
X = df.drop('target', axis=1)
In [47]:
X.shape
Out[47]:
(303, 26)
In [48]:
Y = df.target
In [49]:
Y
Out[49]:
0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64
In [50]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
In [51]:
X.shape
Out[51]:
(303, 26)
In [52]:
X_test.shape
Out[52]:
(61, 26)
In [53]:
from sklearn.ensemble import RandomForestClassifier
In [54]:
model = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=5)
model.fit(X_train, Y_train)
Out[54]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)
In [55]:
# 指定索引为7的决策树
estimator = model.estimators_[7]
In [56]:
estimator
Out[56]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=2059200585, splitter='best')
In [57]:
feature_names = X_train.columns
Y_train_str = Y_train.astype('str')
Y_train_str[Y_train_str == '0'] = 'no disease'
Y_train_str[Y_train_str == '1'] = 'disease'
Y_train_str = Y_train_str.values
In [58]:
# 决策树可视化
from sklearn.tree import export_graphviz
import os
os.environ["PATH"]+= os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/
export_graphviz(estimator, out_file='tree.dot',
                feature_names=feature_names,
               class_names=Y_train_str,
               rounded=True,proportion=True,
               label='root',
                precision=2,filled=True)

from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

from IPython.display import Image
Image(filename='tree.png')
  File "<ipython-input-58-982442ea883c>", line 4
    os.environ["PATH"]+= os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/
                                                                               ^
SyntaxError: EOL while scanning string literal
In [59]:
# 特征重要性
model.feature_importances_
Out[59]:
array([6.60670254e-02, 3.19336677e-02, 4.18244755e-02, 1.31714707e-01,
       6.23415495e-02, 1.17997012e-01, 2.67430846e-02, 6.48399960e-02,
       1.28094019e-02, 5.48471205e-03, 3.60558136e-02, 1.16175294e-01,
       1.50128557e-02, 6.12513372e-03, 8.04760214e-03, 0.00000000e+00,
       1.32497412e-02, 2.18934601e-02, 6.13392463e-02, 3.14811475e-02,
       2.93918057e-02, 1.10773980e-05, 5.96404701e-02, 2.78530632e-03,
       3.70354150e-02, 0.00000000e+00])
In [60]:
import numpy as np
print("特征排序:")
feature_names = X_test.columns
feature_importances = model.feature_importances_
indices = np.argsort(feature_importances)[::-1]

for index in indices:
    print("feature %s (%f)"%(feature_names[index], feature_importances[index]))
特征排序:
feature max_hear_rate_achieced (0.131715)
feature num_major_vessels (0.117997)
feature chest_pain_type_typical angina (0.116175)
feature age (0.066067)
feature sex_male (0.064840)
feature st_depression (0.062342)
feature exercise_induced_angina_yes (0.061339)
feature thalassemin_fixed defect (0.059640)
feature cholesterol (0.041824)
feature thalassemin_reversable defect (0.037035)
feature chest_pain_type_non-anginal pain (0.036056)
feature resting_blood_pressure (0.031934)
feature st_slope_downsloping (0.031481)
feature st_slope_flat (0.029392)
feature sex_female (0.026743)
feature exercise_induced_angina_no (0.021893)
feature fasting_blood_sugar_greater than 120mg/ml (0.015013)
feature rest_ecg_normal (0.013250)
feature chest_pain_type_asymptomatic (0.012809)
feature rest_ecg_ST-T wave abnormality (0.008048)
feature fasting_blood_sugar_lower than 120mg/ml (0.006125)
feature chest_pain_type_atypical angina (0.005485)
feature thalassemin_normal (0.002785)
feature st_slope_upsloping (0.000011)
feature rest_ecg_left ventricular hypertrophy (0.000000)
feature thalassemin_unknown (0.000000)
In [61]:
# 各个特征的权重
import eli5
eli5.show_weights(estimator, feature_names=feature_names.to_list())
Using TensorFlow backend.
Out[61]:
Weight Feature
0.3552 exercise_induced_angina_yes
0.1244 thalassemin_reversable defect
0.1120 age
0.0900 chest_pain_type_typical angina
0.0856 st_slope_flat
0.0619 sex_female
0.0530 cholesterol
0.0318 fasting_blood_sugar_lower than 120mg/ml
0.0279 thalassemin_normal
0.0167 rest_ecg_normal
0.0165 max_hear_rate_achieced
0.0156 num_major_vessels
0.0058 st_depression
0.0029 chest_pain_type_atypical angina
0.0008 st_slope_downsloping
0 sex_male
0 chest_pain_type_asymptomatic
0 thalassemin_unknown
0 resting_blood_pressure
0 fasting_blood_sugar_greater than 120mg/ml
… 6 more …

exercise_induced_angina_yes <= 0.500 (32.9%) chest_pain_type_typical angina <="0.500" (6.0%) fasting_blood_sugar_lower than 120mg ml (2.0%) rest_ecg_normal (0.7%) ---> 1.000
            rest_ecg_normal > 0.500  (1.3%)  ---> 0.000
        fasting_blood_sugar_lower than 120mg/ml > 0.500  (4.0%)
            chest_pain_type_atypical angina <= 0.500 (2.7%) ---> 1.000
            chest_pain_type_atypical angina > 0.500  (1.3%)
                num_major_vessels <= 1.500 (0.7%) ---> 1.000
                num_major_vessels > 1.500  (0.7%)  ---> 0.000
    chest_pain_type_typical angina > 0.500  (26.8%)
        cholesterol <= 237.500 (8.7%) age <="50.000" (2.0%) cholesterol (0.7%) ---> 0.000
                cholesterol > 188.500  (1.3%)  ---> 1.000
            age > 50.000  (6.7%)
                st_depression <= 0.500 (1.3%) ---> 0.333
                st_depression > 0.500  (5.4%)  ---> 0.000
        cholesterol > 237.500  (18.1%)
            age <= 51.500 (3.4%) st_slope_flat <="0.500" (0.7%) ---> 1.000
                st_slope_flat > 0.500  (2.7%)  ---> 0.000
            age > 51.500  (14.8%)  ---> 0.000
exercise_induced_angina_yes > 0.500  (67.1%)
    thalassemin_reversable defect <= 0.500 (47.7%) age <="57.500" (34.2%) chest_pain_type_typical angina (26.8%) st_slope_downsloping (8.7%) ---> 0.947
                st_slope_downsloping > 0.500  (18.1%)  ---> 1.000
            chest_pain_type_typical angina > 0.500  (7.4%)
                age <= 43.500 (2.0%) ---> 1.000
                age > 43.500  (5.4%)  ---> 0.688
        age > 57.500  (13.4%)
            thalassemin_normal <= 0.500 (11.4%) sex_female <="0.500" (2.7%) ---> 0.143
                sex_female > 0.500  (8.7%)  ---> 0.900
            thalassemin_normal > 0.500  (2.0%)  ---> 0.000
    thalassemin_reversable defect > 0.500  (19.5%)
        cholesterol <= 202.000 (2.7%) ---> 0.000
        cholesterol > 202.000  (16.8%)
            st_slope_flat <= 0.500 (6.7%) max_hear_rate_achieced <="194.500" (6.0%) ---> 0.923
                max_hear_rate_achieced > 194.500  (0.7%)  ---> 0.000
            st_slope_flat > 0.500  (10.1%)
                age <= 55.500 (1.3%) ---> 0.667
                age > 55.500  (8.7%)  ---> 0.167
In [62]:
plt.figure(figsize=(16,8))
plt.title("Feature Importance")
plt.bar(range(len(feature_importances)), feature_importances[indices], color='b')
plt.xticks(range(len(feature_importances)), np.array(feature_names)[indices], color='b', rotation=90)
plt.show()

预测分类结果

In [63]:
 X_test.shape
Out[63]:
(61, 26)
In [64]:
X_test.head()
Out[64]:
age resting_blood_pressure cholesterol max_hear_rate_achieced st_depression num_major_vessels sex_female sex_male chest_pain_type_asymptomatic chest_pain_type_atypical angina ... rest_ecg_normal exercise_induced_angina_no exercise_induced_angina_yes st_slope_downsloping st_slope_flat st_slope_upsloping thalassemin_fixed defect thalassemin_normal thalassemin_reversable defect thalassemin_unknown
246 56 134 409 150 1.9 2 1 0 0 0 ... 1 1 0 0 1 0 0 0 1 0
183 58 112 230 165 2.5 1 0 1 0 0 ... 1 0 1 0 1 0 0 0 1 0
229 64 125 309 131 1.8 0 0 1 0 0 ... 0 1 0 0 1 0 0 0 1 0
126 47 112 204 143 0.1 0 0 1 0 0 ... 0 0 1 1 0 0 1 0 0 0
184 50 150 243 128 2.6 0 0 1 0 0 ... 1 0 1 0 1 0 0 0 1 0

5 rows × 26 columns

单个数据预测

In [65]:
test_sample = X_test.iloc[2]
In [66]:
test_sample.shape
Out[66]:
(26,)
In [67]:
# 形成二维数组
test_sample = np.array(test_sample).reshape(1,-1)
In [68]:
test_sample.shape
Out[68]:
(1, 26)
In [69]:
# 二分类定性的分类结果
model.predict(test_sample)
Out[69]:
array([1], dtype=int64)
In [70]:
# 二分类定量的分类结果
model.predict_proba(test_sample)
Out[70]:
array([[0.31108269, 0.68891731]])

全部数据预测

In [71]:
model.predict(X_test)
Out[71]:
array([0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1], dtype=int64)
In [72]:
model.predict_proba(X_test)
Out[72]:
array([[0.66428571, 0.33571429],
       [0.61833333, 0.38166667],
       [0.31108269, 0.68891731],
       [0.31209089, 0.68790911],
       [0.6905424 , 0.3094576 ],
       [0.22041273, 0.77958727],
       [0.34780556, 0.65219444],
       [0.36886891, 0.63113109],
       [0.13705144, 0.86294856],
       [0.82018717, 0.17981283],
       [0.06003598, 0.93996402],
       [0.73095238, 0.26904762],
       [0.26916972, 0.73083028],
       [0.17212339, 0.82787661],
       [0.72667749, 0.27332251],
       [0.17212339, 0.82787661],
       [0.909388  , 0.090612  ],
       [0.98593604, 0.01406396],
       [0.35732323, 0.64267677],
       [0.71292208, 0.28707792],
       [0.93593604, 0.06406396],
       [0.27873161, 0.72126839],
       [0.37863258, 0.62136742],
       [0.18706494, 0.81293506],
       [0.72018717, 0.27981283],
       [0.69565826, 0.30434174],
       [0.98023619, 0.01976381],
       [0.89597763, 0.10402237],
       [0.0867316 , 0.9132684 ],
       [0.97229968, 0.02770032],
       [0.10584577, 0.89415423],
       [0.77958874, 0.22041126],
       [0.9914916 , 0.0085084 ],
       [0.85394067, 0.14605933],
       [0.95512796, 0.04487204],
       [0.68333333, 0.31666667],
       [0.26242857, 0.73757143],
       [0.57464084, 0.42535916],
       [0.17139856, 0.82860144],
       [0.8445219 , 0.1554781 ],
       [0.82539683, 0.17460317],
       [0.7585205 , 0.2414795 ],
       [0.18745915, 0.81254085],
       [0.59214286, 0.40785714],
       [0.07251244, 0.92748756],
       [0.49329187, 0.50670813],
       [0.16329772, 0.83670228],
       [0.17987939, 0.82012061],
       [0.18854167, 0.81145833],
       [0.73932709, 0.26067291],
       [0.56223776, 0.43776224],
       [0.22256944, 0.77743056],
       [0.97785523, 0.02214477],
       [0.78333333, 0.21666667],
       [0.02038478, 0.97961522],
       [0.20855943, 0.79144057],
       [0.13496503, 0.86503497],
       [0.20258816, 0.79741184],
       [0.97229968, 0.02770032],
       [0.97229968, 0.02770032],
       [0.42373047, 0.57626953]])
In [73]:
# 患病的置信度
model.predict_proba(X_test)[:,1]
Out[73]:
array([0.33571429, 0.38166667, 0.68891731, 0.68790911, 0.3094576 ,
       0.77958727, 0.65219444, 0.63113109, 0.86294856, 0.17981283,
       0.93996402, 0.26904762, 0.73083028, 0.82787661, 0.27332251,
       0.82787661, 0.090612  , 0.01406396, 0.64267677, 0.28707792,
       0.06406396, 0.72126839, 0.62136742, 0.81293506, 0.27981283,
       0.30434174, 0.01976381, 0.10402237, 0.9132684 , 0.02770032,
       0.89415423, 0.22041126, 0.0085084 , 0.14605933, 0.04487204,
       0.31666667, 0.73757143, 0.42535916, 0.82860144, 0.1554781 ,
       0.17460317, 0.2414795 , 0.81254085, 0.40785714, 0.92748756,
       0.50670813, 0.83670228, 0.82012061, 0.81145833, 0.26067291,
       0.43776224, 0.77743056, 0.02214477, 0.21666667, 0.97961522,
       0.79144057, 0.86503497, 0.79741184, 0.02770032, 0.02770032,
       0.57626953])
In [74]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

混淆矩阵

In [75]:
from sklearn.metrics import confusion_matrix
In [76]:
confusion_matrix_model = confusion_matrix(Y_test, y_pred)
In [77]:
confusion_matrix_model
Out[77]:
array([[27,  8],
       [ 4, 22]], dtype=int64)
In [78]:
# 混淆矩阵绘制模板
import itertools
def cnf_matrix_plotter(cm, classes):
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Oranges)
    plt.title("Confusion Matrix")
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)
    
    threshold = cm.max()/2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                horizontalalignment='center',
                color='white'if cm[i,j] > threshold else "black",
                fontsize=25)
    plt.tight_layout()
    plt.ylabel("True Label")
    plt.xlabel("Predicted Label")
    plt.show()
In [79]:
cnf_matrix_plotter(confusion_matrix_model, ['Healthy', 'Disease'])

ROC曲线

In [80]:
y_pred_quant = model.predict_proba(X_test)[:,1]
In [81]:
from sklearn.metrics import roc_curve, auc

fpr, tpr, threshold = roc_curve(Y_test, y_pred_quant)
In [82]:
fpr
Out[82]:
array([0.        , 0.        , 0.        , 0.02857143, 0.02857143,
       0.05714286, 0.05714286, 0.05714286, 0.05714286, 0.08571429,
       0.08571429, 0.11428571, 0.11428571, 0.57142857, 0.57142857,
       0.6       , 0.6       , 0.71428571, 0.71428571, 0.77142857,
       0.77142857, 0.8       , 0.88571429, 1.        ])
In [83]:
tpr
Out[83]:
array([0.        , 0.03846154, 0.19230769, 0.19230769, 0.23076923,
       0.23076923, 0.26923077, 0.34615385, 0.69230769, 0.69230769,
       0.73076923, 0.73076923, 0.84615385, 0.84615385, 0.88461538,
       0.88461538, 0.92307692, 0.92307692, 0.96153846, 0.96153846,
       1.        , 1.        , 1.        , 1.        ])
In [84]:
# 阈值
threshold
Out[84]:
array([1.97961522, 0.97961522, 0.89415423, 0.86503497, 0.86294856,
       0.83670228, 0.82860144, 0.82787661, 0.73757143, 0.73083028,
       0.72126839, 0.68891731, 0.64267677, 0.26904762, 0.26067291,
       0.2414795 , 0.22041126, 0.1554781 , 0.14605933, 0.090612  ,
       0.06406396, 0.04487204, 0.02770032, 0.0085084 ])
In [85]:
# 绘制ROC曲线
plt.plot(fpr, tpr)
plt.plot([0,1], [0,1], ls='--', c=".3")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
plt.title("ROC curse")
plt.xlabel("False Positice Rate(1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.grid(True)
In [86]:
auc(fpr, tpr)
Out[86]:
0.8538461538461538

绘制Permutation Importance图

In [87]:
import eli5
from eli5.sklearn import PermutationImportance

# 打乱训练来看特征重要性
perm = PermutationImportance(model, random_state=1).fit(X_test, Y_test)
eli5.show_weights(perm, feature_names=X_test.columns.tolist())
Out[87]:
Weight Feature
0.0557 ± 0.0393 num_major_vessels
0.0262 ± 0.0161 rest_ecg_normal
0.0230 ± 0.0161 thalassemin_fixed defect
0.0230 ± 0.0334 chest_pain_type_typical angina
0.0197 ± 0.0245 sex_male
0.0098 ± 0.0161 age
0.0066 ± 0.0161 exercise_induced_angina_no
0.0066 ± 0.0161 st_depression
0.0033 ± 0.0131 fasting_blood_sugar_greater than 120mg/ml
0.0033 ± 0.0131 thalassemin_reversable defect
0 ± 0.0000 st_slope_downsloping
0 ± 0.0000 sex_female
0 ± 0.0000 st_slope_upsloping
0 ± 0.0000 thalassemin_unknown
0 ± 0.0000 chest_pain_type_atypical angina
0 ± 0.0000 chest_pain_type_asymptomatic
0 ± 0.0000 rest_ecg_left ventricular hypertrophy
0 ± 0.0000 fasting_blood_sugar_lower than 120mg/ml
0 ± 0.0000 thalassemin_normal
0 ± 0.0000 chest_pain_type_non-anginal pain
… 6 more …

PDP图

PDP图反映了某一特征在不同值变化时对模型预测结果的影响。

In [88]:
fig, axes, summary_df = info_plots.actual_plot(
    model=model, X=X_train, feature='sex_male', feature_name='gender',predict_kwds={}
)
In [89]:
fig, axes, summary_df = info_plots.actual_plot(
    model=model, X=X_train, feature='num_major_vessels', feature_name='gender',predict_kwds={}
)

ICE图

将测试集每一个样本的PDP图单独显示出来

In [90]:
feat_name = 'num_major_vessels'
nick_name = 'num_vessels'
pdp_dist = pdp.pdp_isolate(
    model=model, dataset=X_test, model_features=feature_names, feature=feat_name
)

fig, axes = pdp.pdp_plot(pdp_dist, nick_name, plot_lines=True, frac_to_plot=0.8, plot_pts_dist=True)
In [91]:
pdp_dist = pdp.pdp_isolate(
    model=model, dataset=X_test, model_features=feature_names, feature="max_hear_rate_achieced"
)
fig, axes = pdp.pdp_plot(pdp_dist, 'max_heart_rate')
In [92]:
for each in feature_names:
    feat_name = each
    pdp_dist = pdp.pdp_isolate(
    model=model, dataset=X_test, model_features=feature_names, feature=feat_name
    )
    plt.show()

二维PDP图:特征之间的交互关系分析

In [93]:
feat_name1 = "max_hear_rate_achieced"
nick_name1 = "max_hear_rate"
feat_name2 = "num_major_vessels"
nick_name2 = "num_vessels"

inter1 = pdp.pdp_interact(
    model=model, dataset=X_test, model_features=feature_names, features=[feat_name1, feat_name2]
)

fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1, feature_names=[nick_name1, nick_name2], plot_type="contour", x_quantile=True, plot_pdp=True)

shap机器学习可解释性分析包

In [95]:
import shap
shap.initjs()
In [113]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
In [114]:
len(shap_values)
Out[114]:
2
In [115]:
shap_values[0].shape
Out[115]:
(61, 26)
In [117]:
shap_values[1].shape
Out[117]:
(61, 26)
In [119]:
# 患病和不患病的平均概率
explainer.expected_value
Out[119]:
array([0.4268595, 0.5731405])

特征重要度

对于某个特征,计算测试集每个病人的该特征的shap值之和,shap越大,特征越重要
In [120]:
shap.summary_plot(shap_values[1], X_test, plot_type='bar')
In [122]:
# 每一行表示一个特征,红色表示该特征的值较高的数据点,越靠右的电表示该特征对患病影响正相关越高
shap.summary_plot(shap_values[1], X_test)
In [126]:
shap.summary_plot(shap_values[1], X_test, plot_type='violin')
In [128]:
# 对于单个病人
idx = 126
patient = X.iloc[idx,:]
In [136]:
patient
Out[136]:
age                                           47.0
resting_blood_pressure                       112.0
cholesterol                                  204.0
max_hear_rate_achieced                       143.0
st_depression                                  0.1
num_major_vessels                              0.0
sex_female                                     0.0
sex_male                                       1.0
chest_pain_type_asymptomatic                   0.0
chest_pain_type_atypical angina                0.0
chest_pain_type_non-anginal pain               0.0
chest_pain_type_typical angina                 1.0
fasting_blood_sugar_greater than 120mg/ml      0.0
fasting_blood_sugar_lower than 120mg/ml        1.0
rest_ecg_ST-T wave abnormality                 1.0
rest_ecg_left ventricular hypertrophy          0.0
rest_ecg_normal                                0.0
exercise_induced_angina_no                     0.0
exercise_induced_angina_yes                    1.0
st_slope_downsloping                           1.0
st_slope_flat                                  0.0
st_slope_upsloping                             0.0
thalassemin_fixed defect                       1.0
thalassemin_normal                             0.0
thalassemin_reversable defect                  0.0
thalassemin_unknown                            0.0
Name: 126, dtype: float64